{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed vocabulary: a hand-written set of Manglish / Singlish slang terms.\n",
    "# Because this is a set literal, repeated entries (e.g. 'hor', 'paiseh')\n",
    "# collapse to a single element; a few entries are multi-word phrases\n",
    "# ('lan si', 'talk cock').\n",
    "# Reference link kept from the original notebook (presumably where some of\n",
    "# these terms were collected - TODO confirm):\n",
    "# https://guidesify.com/blog/2017/08/13/singlish-phrases-define-singapore/\n",
    "\n",
    "manglish_vocab = {\n",
    "    'siasui', 'lah', 'chun', 'kapster', 'leh', 'lansi', 'lan si', 'meh',\n",
    "    'stim', 'maluation', 'kantoi', 'seh', 'yam', 'hor', 'la', 'cha',\n",
    "    'tao', 'amoi', 'aiya', 'angmor', 'angpau', 'beng', 'chow', 'batam',\n",
    "    'liao', 'nian', 'buiji', 'hou', 'guo', 'jiang', 'chiu', 'buji',\n",
    "    'hao', 'kam', 'wan', 'yao', 'cao', 'ciao', 'jin', 'hoseh',\n",
    "    'jiak', 'ying', 'leybit', 'sibei', 'laobu', 'sia', 'cilok',\n",
    "    'cibai', 'cb', 'entao', 'gwai', 'kai', 'kongmong', 'kapcai',\n",
    "    'lanjiao', 'lancau', 'lalazai', 'momantai', 'paikia', 'paiseh',\n",
    "    'pokai', 'seow', 'sohai', 'sueh', 'tapau', 'wor', 'hor',\n",
    "    'terrer', 'chop', 'lansi', 'paiseh', 'syok', 'shiok',\n",
    "    'sibeh', 'kawkaw', 'abuden', 'mah', 'lor', 'paiseh',\n",
    "    'niang', 'aiya', 'kena', 'aiyo', 'moh', 'bojio',\n",
    "    'buay', 'kia', 'chao', 'chim', 'cheem', 'chiong',\n",
    "    'chiobu', 'dabao', 'kiang', 'hosei', 'hoseh',\n",
    "    'jialat', 'kaypoh', 'kenasai', 'liddat', 'machiam',\n",
    "    'nehmind', 'pokkai', 'shiok', 'siol', 'smlj', 'suan',\n",
    "    'suay', 'swaku', 'swee', 'tyco', 'wapiang', 'walao', 'zhun',\n",
    "    'walau', 'ngaidi', 'ngaidiao', 'jilo', 'kampung',\n",
    "    'kaobu', 'siao', 'obiang', 'orredy', 'oredy',\n",
    "    'pantang', 'pokai', 'siam', 'tangi', 'talk cock', 'tombalek',\n",
    "    'womit', 'yandao'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Download the Wikipedia glossary page that is scraped in the following cells.\n",
    "# requests has no default timeout, so set one to avoid hanging the kernel, and\n",
    "# fail here on an HTTP error instead of silently parsing an error page later.\n",
    "r = requests.get('https://en.wikipedia.org/wiki/Singlish_vocabulary', timeout=30)\n",
    "r.raise_for_status()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Parse the downloaded HTML with the lxml parser. Read the body through the\n",
    "# public `content` attribute (raw bytes) rather than the private `_content`\n",
    "# field of the Response object.\n",
    "soup = BeautifulSoup(r.content, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Characters deleted from the scraped terms: ASCII punctuation without '-',\n",
    "# so hyphenated entries such as 'teh-o' keep their hyphen.\n",
    "PUNCTUATION = '!\"#$%&\\'()*+,./:;<=>?@[\\\\]^_`{|}~'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "559"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the text of every <b> element on the page, ignoring texts shorter\n",
    "# than two characters.\n",
    "bold_texts = [tag.text for tag in soup.find_all('b') if len(tag.text) >= 2]\n",
    "\n",
    "# A bold text such as 'x/y' yields two separate, whitespace-trimmed entries.\n",
    "flatten = [part.strip() for text in bold_texts for part in text.split('/')]\n",
    "\n",
    "# Delete every PUNCTUATION character, drop entries that end up empty and\n",
    "# lowercase the rest.\n",
    "punctuation_remover = str.maketrans('', '', PUNCTUATION)\n",
    "stripped_terms = (term.translate(punctuation_remover) for term in flatten)\n",
    "cleaned = [term.lower() for term in stripped_terms if term]\n",
    "len(cleaned)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "cleaned = ['soc',\n",
    " '4d',\n",
    " '5cs',\n",
    " '11b',\n",
    " 'singapore armed forces identity card',\n",
    " 'abit',\n",
    " 'abuden',\n",
    " 'acbc',\n",
    " 'act blur',\n",
    " 'act cute',\n",
    " 'agak-ration',\n",
    " 'ah beng',\n",
    " 'ah seng',\n",
    " 'ah gua',\n",
    " 'ah kwa',\n",
    " 'ah qua',\n",
    " 'ah lian',\n",
    " 'ah huay',\n",
    " 'ah long',\n",
    " 'ah neh',\n",
    " 'ah pu neh neh',\n",
    " 'ah tiong',\n",
    " 'ai see',\n",
    " 'ai see buay see',\n",
    " 'aiseh',\n",
    " 'ai swee mai mniah',\n",
    " 'ai sui',\n",
    " 'ai tzai',\n",
    " 'aiyah',\n",
    " 'aiyoh',\n",
    " 'aiyoyo',\n",
    " 'ai-yoh-yoh',\n",
    " 'ang moh',\n",
    " 'ang moh pai',\n",
    " 'ang pau',\n",
    " 'ang pow',\n",
    " 'an zhua',\n",
    " 'ar',\n",
    " 'ar bo',\n",
    " 'ay',\n",
    " 'aye',\n",
    " 'ayy',\n",
    " 'bakkwa',\n",
    " 'bak kwa',\n",
    " 'bao toh',\n",
    " 'barang barang',\n",
    " 'basket',\n",
    " 'berak',\n",
    " 'belanja',\n",
    " 'bird bird',\n",
    " 'boh beh zao',\n",
    " 'boh bian',\n",
    " 'boh pien',\n",
    " 'boh chup',\n",
    " 'boh eng',\n",
    " 'boh gay',\n",
    " 'boh geh',\n",
    " 'boh jio',\n",
    " 'bojio',\n",
    " 'boh liao',\n",
    " 'boh ta bo lan pa',\n",
    " 'boh tao boh beh',\n",
    " 'boh zheng hu',\n",
    " 'buay',\n",
    " 'buay pai',\n",
    " 'buay song',\n",
    " 'buay gan',\n",
    " 'buay steady',\n",
    " 'buay tahan',\n",
    " 'catch no ball',\n",
    " 'cert',\n",
    " 'chao',\n",
    " 'chow',\n",
    " 'chao keng',\n",
    " 'chao mugger',\n",
    " 'chap lau chu',\n",
    " 'char bor',\n",
    " 'zha bo',\n",
    " 'chee bai',\n",
    " 'chee ko pek',\n",
    " 'cheena',\n",
    " 'cher',\n",
    " 'tcher',\n",
    " 'chey',\n",
    " 'cheyyy',\n",
    " 'chicken business',\n",
    " 'chi fan',\n",
    " 'chim',\n",
    " 'cheem',\n",
    " 'chin chai',\n",
    " 'chio bu',\n",
    " 'chiong',\n",
    " 'chiong sua',\n",
    " 'chop',\n",
    " 'chop chop',\n",
    " 'chope',\n",
    " 'chui',\n",
    " 'chiu kana kah kah kana lum pah',\n",
    " 'cmi',\n",
    " 'cockanaathan',\n",
    " 'corright',\n",
    " 'da pau',\n",
    " 'da pao',\n",
    " 'da bao',\n",
    " 'damn',\n",
    " 'dao',\n",
    " 'dey',\n",
    " 'diam',\n",
    " 'doneded',\n",
    " 'dont play play',\n",
    " 'du lan',\n",
    " 'dunnid',\n",
    " 'double confirm',\n",
    " 'echerly',\n",
    " 'eeyer',\n",
    " 'eye power',\n",
    " 'falali',\n",
    " 'ferlali',\n",
    " 'fatty bom bom',\n",
    " 'foto',\n",
    " 'fone',\n",
    " 'gabra',\n",
    " 'gahmen',\n",
    " 'geh geh',\n",
    " 'geh kiang',\n",
    " 'ger',\n",
    " 'get',\n",
    " 'gila',\n",
    " 'gone-case',\n",
    " 'goondu',\n",
    " 'gor chiam tua guay gu chia leng',\n",
    " 'gostan',\n",
    " 'guai lan',\n",
    " 'kwai lan',\n",
    " 'handphone',\n",
    " 'hao lian',\n",
    " 'heng',\n",
    " 'helication',\n",
    " 'horlan',\n",
    " 'ho liao',\n",
    " 'hong bao',\n",
    " 'hongbao',\n",
    " 'hong gan liao',\n",
    " 'hong kan liao',\n",
    " 'hosei',\n",
    " 'ho seh',\n",
    " 'ho say',\n",
    " 'ho seh bo',\n",
    " 'hosei liao',\n",
    " 'huat',\n",
    " 'hum ji',\n",
    " 'humji',\n",
    " 'hum chi',\n",
    " 'humchi',\n",
    " 'ini macam',\n",
    " 'jelak',\n",
    " 'jiak',\n",
    " 'jiak cao',\n",
    " 'jiak chao',\n",
    " 'jiak zua',\n",
    " 'jiak kantang',\n",
    " 'jiak liao bee',\n",
    " 'jia lat',\n",
    " 'jialat',\n",
    " 'jibaboom',\n",
    " 'jibabom',\n",
    " 'jibai',\n",
    " 'chee bye',\n",
    " 'ci bai',\n",
    " 'chee bai',\n",
    " 'jibra',\n",
    " 'jio',\n",
    " 'jilo',\n",
    " 'jiro',\n",
    " 'zilo',\n",
    " 'jiuhu',\n",
    " 'kae ang moh',\n",
    " 'kah kenna chiu chiu kenna kah',\n",
    " 'kenz',\n",
    " 'kanina',\n",
    " 'kar chng',\n",
    " 'kar pak',\n",
    " 'kaypoh',\n",
    " 'kee siao',\n",
    " 'keling kia',\n",
    " 'kena',\n",
    " 'kena sai',\n",
    " 'kerlik',\n",
    " 'khi chia',\n",
    " 'kiam',\n",
    " 'khiam pah',\n",
    " 'kiasi',\n",
    " 'kiasu',\n",
    " 'kong ka kiao',\n",
    " 'kopi',\n",
    " 'kopi tiam',\n",
    " 'kopi-tiam',\n",
    " 'kopitiam',\n",
    " 'koyah',\n",
    " 'ku ku jiao',\n",
    " 'lah',\n",
    " 'lan jiao',\n",
    " 'lao lan',\n",
    " 'lao pei huet',\n",
    " 'lao sai',\n",
    " 'la sai',\n",
    " 'lagi',\n",
    " 'leh',\n",
    " 'leh chey',\n",
    " 'lepak',\n",
    " 'liao',\n",
    " 'liek boh kiew',\n",
    " 'liak bo kiu',\n",
    " 'liddat',\n",
    " 'limbu',\n",
    " 'lim bu',\n",
    " 'limpeh',\n",
    " 'lim peh',\n",
    " 'limpei',\n",
    " 'lin lao hia',\n",
    " 'lin nao hia',\n",
    " 'lobang',\n",
    " 'lombang',\n",
    " 'long zhong',\n",
    " 'long zong',\n",
    " 'lor',\n",
    " 'luan',\n",
    " 'lun zun',\n",
    " 'mader',\n",
    " 'mafan',\n",
    " 'mah',\n",
    " 'mampat',\n",
    " 'mang zang',\n",
    " 'mata',\n",
    " 'meh',\n",
    " 'mong cha cha',\n",
    " 'mug',\n",
    " 'neh neh',\n",
    " 'neh neh pok',\n",
    " 'ngeow',\n",
    " 'nia',\n",
    " 'nia gong',\n",
    " 'nia gong de ji dan',\n",
    " 'no eye see',\n",
    " 'no horse run',\n",
    " 'nuah',\n",
    " 'obasan',\n",
    " 'obiang',\n",
    " 'orbit',\n",
    " 'orbi',\n",
    " 'ord',\n",
    " 'ord loh',\n",
    " 'owadio',\n",
    " 'orh',\n",
    " 'orh hor',\n",
    " 'oso',\n",
    " 'ownself',\n",
    " 'pai kia',\n",
    " 'pai seh',\n",
    " 'paiseh',\n",
    " 'pak chiu cheng pcc',\n",
    " 'pak zam',\n",
    " 'pak tor',\n",
    " 'paktor',\n",
    " 'pang chance',\n",
    " 'pang sai',\n",
    " 'pang seh',\n",
    " 'pang jio',\n",
    " 'pariah',\n",
    " 'pasar malam',\n",
    " 'photostat',\n",
    " 'pia',\n",
    " 'piak piak',\n",
    " 'pian yi dao lao sai',\n",
    " 'pok kai',\n",
    " 'pon',\n",
    " 'powderful',\n",
    " 'pai tao',\n",
    " 'puki',\n",
    " 'pundek',\n",
    " 'rabak',\n",
    " 'rabz',\n",
    " 'rabz-kebabz',\n",
    " 'sabo',\n",
    " 'sakar',\n",
    " 'saman',\n",
    " 'sampat',\n",
    " 'sam seng',\n",
    " 'sargen',\n",
    " 'scorching eagle',\n",
    " 'sei',\n",
    " 'see first',\n",
    " 'see me no up',\n",
    " 'see you very up',\n",
    " 'shame shame',\n",
    " 'shiok',\n",
    " 'showflat',\n",
    " 'siah',\n",
    " 'siam',\n",
    " 'sian',\n",
    " 'sien',\n",
    " 'siao',\n",
    " 'sia suay',\n",
    " 'sibeh',\n",
    " 'si beh',\n",
    " 'simi',\n",
    " 'si mi',\n",
    " 'x verb simi x',\n",
    " 'si mi lan jiao',\n",
    " 'si mi tai dzi',\n",
    " 'slow-poke',\n",
    " 'sod',\n",
    " 'solid',\n",
    " 'solid bird bird',\n",
    " 'song',\n",
    " 'sozai',\n",
    " 'sor zai',\n",
    " 'sotong',\n",
    " 'stay on topic',\n",
    " 'steady pom pi pi',\n",
    " 'suay',\n",
    " 'suka',\n",
    " 'suku',\n",
    " 'sui',\n",
    " 'swee',\n",
    " 'swaku',\n",
    " 'tahan',\n",
    " 'tai ko',\n",
    " 'tyco',\n",
    " 'tak boleh',\n",
    " 'tak boleh tahan',\n",
    " 'talk cock',\n",
    " 'tok kok',\n",
    " 'tan ku ku',\n",
    " 'ta pau',\n",
    " 'ta pao',\n",
    " 'tau pok',\n",
    " 'teh',\n",
    " 'terbalik',\n",
    " 'tembalik',\n",
    " 'terok',\n",
    " 'thambi',\n",
    " 'tambi',\n",
    " 'thiam',\n",
    " 'tio',\n",
    " 'tir ko pek',\n",
    " 'toot',\n",
    " 'towkay',\n",
    " 'tow kay',\n",
    " 'tsai',\n",
    " 'zai',\n",
    " 'tuang',\n",
    " 'twa kee',\n",
    " 'ulu',\n",
    " 'un',\n",
    " 'understooded',\n",
    " 'up lorry',\n",
    " 'vomit blood',\n",
    " 'wah lan eh',\n",
    " 'wah lao eh',\n",
    " 'walao eh',\n",
    " 'wah lau eh',\n",
    " 'walau eh',\n",
    " 'wah kao',\n",
    " 'wakao',\n",
    " 'wah seh',\n",
    " 'wayang',\n",
    " 'where got',\n",
    " 'white horse',\n",
    " 'womit',\n",
    " 'xia lan',\n",
    " 'xia suay',\n",
    " 'sia suay',\n",
    " 'yandao',\n",
    " 'yaya papaya',\n",
    " 'your grandfathers road ah',\n",
    " 'your head',\n",
    " 'zai',\n",
    " 'zao hor',\n",
    " 'zao kng',\n",
    " 'zha bo',\n",
    " 'zhun',\n",
    " 'zhun bo',\n",
    " 'zilo',\n",
    " 'zi siao',\n",
    " 'char kway teow',\n",
    " 'chwee kueh',\n",
    " 'hokkien char mee',\n",
    " 'hokkien hae mee',\n",
    " 'kaya',\n",
    " 'roti-kaya',\n",
    " 'otah',\n",
    " 'popiah',\n",
    " 'rojak',\n",
    " 'roti john',\n",
    " 'tauge',\n",
    " 'taoge',\n",
    " 'taugeh',\n",
    " 'taugey',\n",
    " 'tau gee',\n",
    " 'tze char',\n",
    " 'teh',\n",
    " 'teh-o',\n",
    " 'teh-o-ice-limau',\n",
    " 'teh-c',\n",
    " 'teh-cino',\n",
    " 'teh-peng',\n",
    " 'teh-poh',\n",
    " 'teh-kosong',\n",
    " 'teh-kah-dai',\n",
    " 'teh-siu-dai',\n",
    " 'teh-pua seo',\n",
    " 'teh-o-kah-dai',\n",
    " 'teh-o-siu-dai',\n",
    " 'teh-c-kah-dai',\n",
    " 'teh-c-siu-dai',\n",
    " 'teh-packet or teh-pao',\n",
    " 'teh-tarik',\n",
    " 'teh-halia',\n",
    " 'teh-halia tarik',\n",
    " 'tiao he',\n",
    " 'tiau hir',\n",
    " 'kopi',\n",
    " 'kopi-o',\n",
    " 'kopi-c',\n",
    " 'kopi kosong',\n",
    " 'kopi-peng',\n",
    " 'kopi-packet',\n",
    " 'kopi-pao',\n",
    " 'kopi-pua seo',\n",
    " 'kopi-gao',\n",
    " 'kopi-poh',\n",
    " 'kopi-kah-dai',\n",
    " 'kopi-siu-dai',\n",
    " 'bandung drink',\n",
    " 'ice kosong',\n",
    " 'horlick-dinosaur',\n",
    " 'horlick-sio',\n",
    " 'horlick-peng',\n",
    " 'milo-sio',\n",
    " 'milo dinosaur',\n",
    " 'milo-peng',\n",
    " 'tak kiu',\n",
    " 'tak kiu-peng',\n",
    " 'arrow',\n",
    " 'auntie',\n",
    " 'banana',\n",
    " 'blur',\n",
    " 'blur like sotong',\n",
    " 'confirm',\n",
    " 'confirm plus guarantee got chop',\n",
    " 'confirm plus chop',\n",
    " 'help lah',\n",
    " 'on off',\n",
    " 'on ah',\n",
    " 'pass up',\n",
    " 'prc',\n",
    " 'dont fly my kite',\n",
    " 'aeroplane',\n",
    " 'dont play play',\n",
    " 'got problem ah',\n",
    " 'having here',\n",
    " 'he still small boy one',\n",
    " 'issit',\n",
    " 'izzit',\n",
    " 'last time policemen wear shorts',\n",
    " 'liddat oso can',\n",
    " 'my england not powderful',\n",
    " 'no fish prawn oso can',\n",
    " 'not happy talk outside',\n",
    " 'on lah',\n",
    " 'relak lah',\n",
    " 'then you know',\n",
    " 'why you so liddat ar',\n",
    " 'you think he think who confirm',\n",
    " 'you think i thought who confirm',\n",
    " 'you want 10 cent',\n",
    " 'your grandfathers place',\n",
    " 'road ah your father own this place',\n",
    " 'road',\n",
    " 'you play where one',\n",
    " 'singapore english',\n",
    " ''\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the scraped terms into the seed vocabulary. The hard-coded `cleaned`\n",
    "# list ends with an empty string; skip empty terms so '' never becomes a\n",
    "# vocabulary entry.\n",
    "manglish_vocab = manglish_vocab | {term for term in cleaned if term}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'',\n",
       " '11b',\n",
       " '4d',\n",
       " '5cs',\n",
       " 'abit',\n",
       " 'abuden',\n",
       " 'acbc',\n",
       " 'act blur',\n",
       " 'act cute',\n",
       " 'aeroplane',\n",
       " 'agak-ration',\n",
       " 'ah beng',\n",
       " 'ah gua',\n",
       " 'ah huay',\n",
       " 'ah kwa',\n",
       " 'ah lian',\n",
       " 'ah long',\n",
       " 'ah neh',\n",
       " 'ah pu neh neh',\n",
       " 'ah qua',\n",
       " 'ah seng',\n",
       " 'ah tiong',\n",
       " 'ai see',\n",
       " 'ai see buay see',\n",
       " 'ai sui',\n",
       " 'ai swee mai mniah',\n",
       " 'ai tzai',\n",
       " 'ai-yoh-yoh',\n",
       " 'aiseh',\n",
       " 'aiya',\n",
       " 'aiyah',\n",
       " 'aiyo',\n",
       " 'aiyoh',\n",
       " 'aiyoyo',\n",
       " 'amoi',\n",
       " 'an zhua',\n",
       " 'ang moh',\n",
       " 'ang moh pai',\n",
       " 'ang pau',\n",
       " 'ang pow',\n",
       " 'angmor',\n",
       " 'angpau',\n",
       " 'ar',\n",
       " 'ar bo',\n",
       " 'arrow',\n",
       " 'auntie',\n",
       " 'ay',\n",
       " 'aye',\n",
       " 'ayy',\n",
       " 'bak kwa',\n",
       " 'bakkwa',\n",
       " 'banana',\n",
       " 'bandung drink',\n",
       " 'bao toh',\n",
       " 'barang barang',\n",
       " 'basket',\n",
       " 'batam',\n",
       " 'belanja',\n",
       " 'beng',\n",
       " 'berak',\n",
       " 'bird bird',\n",
       " 'blur',\n",
       " 'blur like sotong',\n",
       " 'boh beh zao',\n",
       " 'boh bian',\n",
       " 'boh chup',\n",
       " 'boh eng',\n",
       " 'boh gay',\n",
       " 'boh geh',\n",
       " 'boh jio',\n",
       " 'boh liao',\n",
       " 'boh pien',\n",
       " 'boh ta bo lan pa',\n",
       " 'boh tao boh beh',\n",
       " 'boh zheng hu',\n",
       " 'bojio',\n",
       " 'buay',\n",
       " 'buay gan',\n",
       " 'buay pai',\n",
       " 'buay song',\n",
       " 'buay steady',\n",
       " 'buay tahan',\n",
       " 'buiji',\n",
       " 'buji',\n",
       " 'cao',\n",
       " 'catch no ball',\n",
       " 'cb',\n",
       " 'cert',\n",
       " 'cha',\n",
       " 'chao',\n",
       " 'chao keng',\n",
       " 'chao mugger',\n",
       " 'chap lau chu',\n",
       " 'char bor',\n",
       " 'char kway teow',\n",
       " 'chee bai',\n",
       " 'chee bye',\n",
       " 'chee ko pek',\n",
       " 'cheem',\n",
       " 'cheena',\n",
       " 'cher',\n",
       " 'chey',\n",
       " 'cheyyy',\n",
       " 'chi fan',\n",
       " 'chicken business',\n",
       " 'chim',\n",
       " 'chin chai',\n",
       " 'chio bu',\n",
       " 'chiobu',\n",
       " 'chiong',\n",
       " 'chiong sua',\n",
       " 'chiu',\n",
       " 'chiu kana kah kah kana lum pah',\n",
       " 'chop',\n",
       " 'chop chop',\n",
       " 'chope',\n",
       " 'chow',\n",
       " 'chui',\n",
       " 'chun',\n",
       " 'chwee kueh',\n",
       " 'ci bai',\n",
       " 'ciao',\n",
       " 'cibai',\n",
       " 'cilok',\n",
       " 'cmi',\n",
       " 'cockanaathan',\n",
       " 'confirm',\n",
       " 'confirm plus chop',\n",
       " 'confirm plus guarantee got chop',\n",
       " 'corright',\n",
       " 'da bao',\n",
       " 'da pao',\n",
       " 'da pau',\n",
       " 'dabao',\n",
       " 'damn',\n",
       " 'dao',\n",
       " 'dey',\n",
       " 'diam',\n",
       " 'doneded',\n",
       " 'dont fly my kite',\n",
       " 'dont play play',\n",
       " 'double confirm',\n",
       " 'du lan',\n",
       " 'dunnid',\n",
       " 'echerly',\n",
       " 'eeyer',\n",
       " 'entao',\n",
       " 'eye power',\n",
       " 'falali',\n",
       " 'fatty bom bom',\n",
       " 'ferlali',\n",
       " 'fone',\n",
       " 'foto',\n",
       " 'gabra',\n",
       " 'gahmen',\n",
       " 'geh geh',\n",
       " 'geh kiang',\n",
       " 'ger',\n",
       " 'get',\n",
       " 'gila',\n",
       " 'gone-case',\n",
       " 'goondu',\n",
       " 'gor chiam tua guay gu chia leng',\n",
       " 'gostan',\n",
       " 'got problem ah',\n",
       " 'guai lan',\n",
       " 'guo',\n",
       " 'gwai',\n",
       " 'handphone',\n",
       " 'hao',\n",
       " 'hao lian',\n",
       " 'having here',\n",
       " 'he still small boy one',\n",
       " 'helication',\n",
       " 'help lah',\n",
       " 'heng',\n",
       " 'ho liao',\n",
       " 'ho say',\n",
       " 'ho seh',\n",
       " 'ho seh bo',\n",
       " 'hokkien char mee',\n",
       " 'hokkien hae mee',\n",
       " 'hong bao',\n",
       " 'hong gan liao',\n",
       " 'hong kan liao',\n",
       " 'hongbao',\n",
       " 'hor',\n",
       " 'horlan',\n",
       " 'horlick-dinosaur',\n",
       " 'horlick-peng',\n",
       " 'horlick-sio',\n",
       " 'hoseh',\n",
       " 'hosei',\n",
       " 'hosei liao',\n",
       " 'hou',\n",
       " 'huat',\n",
       " 'hum chi',\n",
       " 'hum ji',\n",
       " 'humchi',\n",
       " 'humji',\n",
       " 'ice kosong',\n",
       " 'ini macam',\n",
       " 'issit',\n",
       " 'izzit',\n",
       " 'jelak',\n",
       " 'jia lat',\n",
       " 'jiak',\n",
       " 'jiak cao',\n",
       " 'jiak chao',\n",
       " 'jiak kantang',\n",
       " 'jiak liao bee',\n",
       " 'jiak zua',\n",
       " 'jialat',\n",
       " 'jiang',\n",
       " 'jibabom',\n",
       " 'jibaboom',\n",
       " 'jibai',\n",
       " 'jibra',\n",
       " 'jilo',\n",
       " 'jin',\n",
       " 'jio',\n",
       " 'jiro',\n",
       " 'jiuhu',\n",
       " 'kae ang moh',\n",
       " 'kah kenna chiu chiu kenna kah',\n",
       " 'kai',\n",
       " 'kam',\n",
       " 'kampung',\n",
       " 'kanina',\n",
       " 'kantoi',\n",
       " 'kaobu',\n",
       " 'kapcai',\n",
       " 'kapster',\n",
       " 'kar chng',\n",
       " 'kar pak',\n",
       " 'kawkaw',\n",
       " 'kaya',\n",
       " 'kaypoh',\n",
       " 'kee siao',\n",
       " 'keling kia',\n",
       " 'kena',\n",
       " 'kena sai',\n",
       " 'kenasai',\n",
       " 'kenz',\n",
       " 'kerlik',\n",
       " 'khi chia',\n",
       " 'khiam pah',\n",
       " 'kia',\n",
       " 'kiam',\n",
       " 'kiang',\n",
       " 'kiasi',\n",
       " 'kiasu',\n",
       " 'kong ka kiao',\n",
       " 'kongmong',\n",
       " 'kopi',\n",
       " 'kopi kosong',\n",
       " 'kopi tiam',\n",
       " 'kopi-c',\n",
       " 'kopi-gao',\n",
       " 'kopi-kah-dai',\n",
       " 'kopi-o',\n",
       " 'kopi-packet',\n",
       " 'kopi-pao',\n",
       " 'kopi-peng',\n",
       " 'kopi-poh',\n",
       " 'kopi-pua seo',\n",
       " 'kopi-siu-dai',\n",
       " 'kopi-tiam',\n",
       " 'kopitiam',\n",
       " 'koyah',\n",
       " 'ku ku jiao',\n",
       " 'kwai lan',\n",
       " 'la',\n",
       " 'la sai',\n",
       " 'lagi',\n",
       " 'lah',\n",
       " 'lalazai',\n",
       " 'lan jiao',\n",
       " 'lan si',\n",
       " 'lancau',\n",
       " 'lanjiao',\n",
       " 'lansi',\n",
       " 'lao lan',\n",
       " 'lao pei huet',\n",
       " 'lao sai',\n",
       " 'laobu',\n",
       " 'last time policemen wear shorts',\n",
       " 'leh',\n",
       " 'leh chey',\n",
       " 'lepak',\n",
       " 'leybit',\n",
       " 'liak bo kiu',\n",
       " 'liao',\n",
       " 'liddat',\n",
       " 'liddat oso can',\n",
       " 'liek boh kiew',\n",
       " 'lim bu',\n",
       " 'lim peh',\n",
       " 'limbu',\n",
       " 'limpeh',\n",
       " 'limpei',\n",
       " 'lin lao hia',\n",
       " 'lin nao hia',\n",
       " 'lobang',\n",
       " 'lombang',\n",
       " 'long zhong',\n",
       " 'long zong',\n",
       " 'lor',\n",
       " 'luan',\n",
       " 'lun zun',\n",
       " 'machiam',\n",
       " 'mader',\n",
       " 'mafan',\n",
       " 'mah',\n",
       " 'maluation',\n",
       " 'mampat',\n",
       " 'mang zang',\n",
       " 'mata',\n",
       " 'meh',\n",
       " 'milo dinosaur',\n",
       " 'milo-peng',\n",
       " 'milo-sio',\n",
       " 'moh',\n",
       " 'momantai',\n",
       " 'mong cha cha',\n",
       " 'mug',\n",
       " 'my england not powderful',\n",
       " 'neh neh',\n",
       " 'neh neh pok',\n",
       " 'nehmind',\n",
       " 'ngaidi',\n",
       " 'ngaidiao',\n",
       " 'ngeow',\n",
       " 'nia',\n",
       " 'nia gong',\n",
       " 'nia gong de ji dan',\n",
       " 'nian',\n",
       " 'niang',\n",
       " 'no eye see',\n",
       " 'no fish prawn oso can',\n",
       " 'no horse run',\n",
       " 'not happy talk outside',\n",
       " 'nuah',\n",
       " 'obasan',\n",
       " 'obiang',\n",
       " 'on ah',\n",
       " 'on lah',\n",
       " 'on off',\n",
       " 'orbi',\n",
       " 'orbit',\n",
       " 'ord',\n",
       " 'ord loh',\n",
       " 'oredy',\n",
       " 'orh',\n",
       " 'orh hor',\n",
       " 'orredy',\n",
       " 'oso',\n",
       " 'otah',\n",
       " 'owadio',\n",
       " 'ownself',\n",
       " 'pai kia',\n",
       " 'pai seh',\n",
       " 'pai tao',\n",
       " 'paikia',\n",
       " 'paiseh',\n",
       " 'pak chiu cheng pcc',\n",
       " 'pak tor',\n",
       " 'pak zam',\n",
       " 'paktor',\n",
       " 'pang chance',\n",
       " 'pang jio',\n",
       " 'pang sai',\n",
       " 'pang seh',\n",
       " 'pantang',\n",
       " 'pariah',\n",
       " 'pasar malam',\n",
       " 'pass up',\n",
       " 'photostat',\n",
       " 'pia',\n",
       " 'piak piak',\n",
       " 'pian yi dao lao sai',\n",
       " 'pok kai',\n",
       " 'pokai',\n",
       " 'pokkai',\n",
       " 'pon',\n",
       " 'popiah',\n",
       " 'powderful',\n",
       " 'prc',\n",
       " 'puki',\n",
       " 'pundek',\n",
       " 'rabak',\n",
       " 'rabz',\n",
       " 'rabz-kebabz',\n",
       " 'relak lah',\n",
       " 'road',\n",
       " 'road ah your father own this place',\n",
       " 'rojak',\n",
       " 'roti john',\n",
       " 'roti-kaya',\n",
       " 'sabo',\n",
       " 'sakar',\n",
       " 'sam seng',\n",
       " 'saman',\n",
       " 'sampat',\n",
       " 'sargen',\n",
       " 'scorching eagle',\n",
       " 'see first',\n",
       " 'see me no up',\n",
       " 'see you very up',\n",
       " 'seh',\n",
       " 'sei',\n",
       " 'seow',\n",
       " 'shame shame',\n",
       " 'shiok',\n",
       " 'showflat',\n",
       " 'si beh',\n",
       " 'si mi',\n",
       " 'si mi lan jiao',\n",
       " 'si mi tai dzi',\n",
       " 'sia',\n",
       " 'sia suay',\n",
       " 'siah',\n",
       " 'siam',\n",
       " 'sian',\n",
       " 'siao',\n",
       " 'siasui',\n",
       " 'sibeh',\n",
       " 'sibei',\n",
       " 'sien',\n",
       " 'simi',\n",
       " 'singapore armed forces identity card',\n",
       " 'singapore english',\n",
       " 'siol',\n",
       " 'slow-poke',\n",
       " 'smlj',\n",
       " 'soc',\n",
       " 'sod',\n",
       " 'sohai',\n",
       " 'solid',\n",
       " 'solid bird bird',\n",
       " 'song',\n",
       " 'sor zai',\n",
       " 'sotong',\n",
       " 'sozai',\n",
       " 'stay on topic',\n",
       " 'steady pom pi pi',\n",
       " 'stim',\n",
       " 'suan',\n",
       " 'suay',\n",
       " 'sueh',\n",
       " 'sui',\n",
       " 'suka',\n",
       " 'suku',\n",
       " 'swaku',\n",
       " 'swee',\n",
       " 'syok',\n",
       " 'ta pao',\n",
       " 'ta pau',\n",
       " 'tahan',\n",
       " 'tai ko',\n",
       " 'tak boleh',\n",
       " 'tak boleh tahan',\n",
       " 'tak kiu',\n",
       " 'tak kiu-peng',\n",
       " 'talk cock',\n",
       " 'tambi',\n",
       " 'tan ku ku',\n",
       " 'tangi',\n",
       " 'tao',\n",
       " 'taoge',\n",
       " 'tapau',\n",
       " 'tau gee',\n",
       " 'tau pok',\n",
       " 'tauge',\n",
       " 'taugeh',\n",
       " 'taugey',\n",
       " 'tcher',\n",
       " 'teh',\n",
       " 'teh-c',\n",
       " 'teh-c-kah-dai',\n",
       " 'teh-c-siu-dai',\n",
       " 'teh-cino',\n",
       " 'teh-halia',\n",
       " 'teh-halia tarik',\n",
       " 'teh-kah-dai',\n",
       " 'teh-kosong',\n",
       " 'teh-o',\n",
       " 'teh-o-ice-limau',\n",
       " 'teh-o-kah-dai',\n",
       " 'teh-o-siu-dai',\n",
       " 'teh-packet or teh-pao',\n",
       " 'teh-peng',\n",
       " 'teh-poh',\n",
       " 'teh-pua seo',\n",
       " 'teh-siu-dai',\n",
       " 'teh-tarik',\n",
       " 'tembalik',\n",
       " 'terbalik',\n",
       " 'terok',\n",
       " 'terrer',\n",
       " 'thambi',\n",
       " 'then you know',\n",
       " 'thiam',\n",
       " 'tiao he',\n",
       " 'tiau hir',\n",
       " 'tio',\n",
       " 'tir ko pek',\n",
       " 'tok kok',\n",
       " 'tombalek',\n",
       " 'toot',\n",
       " 'tow kay',\n",
       " 'towkay',\n",
       " 'tsai',\n",
       " 'tuang',\n",
       " 'twa kee',\n",
       " 'tyco',\n",
       " 'tze char',\n",
       " 'ulu',\n",
       " 'un',\n",
       " 'understooded',\n",
       " 'up lorry',\n",
       " 'vomit blood',\n",
       " 'wah kao',\n",
       " 'wah lan eh',\n",
       " 'wah lao eh',\n",
       " 'wah lau eh',\n",
       " 'wah seh',\n",
       " 'wakao',\n",
       " 'walao',\n",
       " 'walao eh',\n",
       " 'walau',\n",
       " 'walau eh',\n",
       " 'wan',\n",
       " 'wapiang',\n",
       " 'wayang',\n",
       " 'where got',\n",
       " 'white horse',\n",
       " 'why you so liddat ar',\n",
       " 'womit',\n",
       " 'wor',\n",
       " 'x verb simi x',\n",
       " 'xia lan',\n",
       " 'xia suay',\n",
       " 'yam',\n",
       " 'yandao',\n",
       " 'yao',\n",
       " 'yaya papaya',\n",
       " 'ying',\n",
       " 'you play where one',\n",
       " 'you think he think who confirm',\n",
       " 'you think i thought who confirm',\n",
       " 'you want 10 cent',\n",
       " 'your grandfathers place',\n",
       " 'your grandfathers road ah',\n",
       " 'your head',\n",
       " 'zai',\n",
       " 'zao hor',\n",
       " 'zao kng',\n",
       " 'zha bo',\n",
       " 'zhun',\n",
       " 'zhun bo',\n",
       " 'zi siao',\n",
       " 'zilo'}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manglish_vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/twitter/manglish.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-MalaysiaPolitics\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-MalaysianFood\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-MalaysianPF\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-SingaporeRaw\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-malaysia\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-malaysians\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/reddit/r-singapore\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/singlish/singlish.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/singlish/sg-news.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def language_detection_textcleaning(string):\n",
    "    \"\"\"\n",
    "    Normalise raw text before it is passed to language detection.\n",
    "\n",
    "    Steps, in order: drop whitespace-separated tokens containing '#' or '@',\n",
    "    delete URLs ('http...' and 'www....'), replace digits and most punctuation\n",
    "    with spaces, collapse runs of spaces, strip and lowercase.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string : str\n",
    "        Raw text.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Cleaned, lowercased text; empty when nothing survives the cleaning.\n",
    "    \"\"\"\n",
    "    # str.split() splits on every whitespace run, so newlines and tabs are\n",
    "    # already gone once the kept tokens are re-joined with single spaces.\n",
    "    tokens = [t for t in string.split() if '#' not in t and '@' not in t]\n",
    "\n",
    "    # The dot after 'www' is escaped on purpose: unescaped it matched any\n",
    "    # character, so a word ending in 'www' (e.g. 'awww') swallowed the word\n",
    "    # that followed it.\n",
    "    string = re.sub(r'http\\S+|www\\.\\S+', '', ' '.join(tokens))\n",
    "\n",
    "    # Surround these characters with spaces so they never glue two words\n",
    "    # together; all of them except '=' are blanked by the substitution below.\n",
    "    chars = ',.()!:\\'\"/;=-'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "\n",
    "    # Digits and the listed punctuation become spaces.\n",
    "    string = re.sub(\n",
    "        r'[0-9!@#$%^&*()_\\-+{}|\\~`\\'\";:?/.>,<]', ' ', string, flags=re.UNICODE\n",
    "    )\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "\n",
    "    return string.lower()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'a'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "language_detection_textcleaning('a')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import malaya\n",
    "# Load the language-detection model named 'mesolitica/fasttext-language-detection-v1'\n",
    "# through malaya's fasttext loader. NOTE(review): the `load_model` warning in this\n",
    "# cell's saved stderr presumably comes from the underlying fastText library - confirm.\n",
    "language_detection = malaya.language_detection.fasttext(model = 'mesolitica/fasttext-language-detection-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the corpus list with one entry per line of singlish.txt. Splitting on\n",
    "# '\\n' can leave empty strings (e.g. after a trailing newline); they are\n",
    "# filtered out in the following cell.\n",
    "with open('singlish.txt') as fopen:\n",
    "    singlish = fopen.read().split('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "\n",
    "# Append every line of each file matching r-* (the Reddit dumps listed in the\n",
    "# commented-out wget cell) to the corpus. glob() returns paths in arbitrary\n",
    "# order, so sort them to keep the order of `singlish` reproducible.\n",
    "files = sorted(glob('r-*'))\n",
    "for f in files:\n",
    "    with open(f) as fopen:\n",
    "        singlish.extend(fopen.read().split('\\n'))\n",
    "\n",
    "# Drop empty lines, including any left over from singlish.txt.\n",
    "singlish = [t for t in singlish if len(t)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "16698133it [00:34, 490565.81it/s]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Read the hardwarezone-sg JSONL dump line by line (tqdm only shows progress).\n",
    "# Each line is decoded with json.loads and then .strip()-ed, so every record is\n",
    "# expected to be a JSON string; records that are empty after stripping are skipped.\n",
    "# The cell appends to `singlish` in place, so running it twice adds the rows twice.\n",
    "with open('/home/husein/ssd3/hardwarezone-sg/everything.jsonl') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        data = json.loads(l).strip()\n",
    "        if len(data):\n",
    "            singlish.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "231202it [00:00, 417909.17it/s]\n"
     ]
    }
   ],
   "source": [
    "# Decode and strip each line of the salary-sg dump lazily, keeping only the\n",
    "# non-empty results.\n",
    "with open('/home/husein/ssd3/salary-sg/everything.jsonl') as fopen:\n",
    "    stripped_posts = (json.loads(raw_line).strip() for raw_line in tqdm(fopen))\n",
    "    singlish.extend(post for post in stripped_posts if post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/home/husein/ssd3/crawl-lowyat/PolitiKs.json',\n",
       " '/home/husein/ssd3/crawl-lowyat/kopitiam.json',\n",
       " '/home/husein/ssd3/crawl-lowyat/SeriousKopitiam.json',\n",
       " '/home/husein/ssd3/crawl-lowyat/PropertyTalk.json']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the crawl-lowyat JSON dumps, leaving out any path containing '-topics'.\n",
    "lowyat = [\n",
    "    path\n",
    "    for path in glob('/home/husein/ssd3/crawl-lowyat/*.json')\n",
    "    if '-topics' not in path\n",
    "]\n",
    "lowyat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each lowyat dump is loaded as one JSON document whose items are appended to\n",
    "# the corpus as-is (presumably a list of post strings - confirm against the crawler).\n",
    "for dump_path in lowyat:\n",
    "    with open(dump_path) as fopen:\n",
    "        singlish.extend(json.load(fopen))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "41670469"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of lines collected from all sources, before cleaning.\n",
    "len(singlish)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████| 41670469/41670469 [08:51<00:00, 78331.97it/s]\n"
     ]
    }
   ],
   "source": [
    "# Run the text cleaner over every collected line, with a progress bar.\n",
    "singlish = list(map(language_detection_textcleaning, tqdm(singlish)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "def _pad_sequence(\n",
    "    sequence,\n",
    "    n,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    \"\"\"Return an iterator over ``sequence`` with optional padding on either side.\n",
    "\n",
    "    Each requested side receives ``n - 1`` copies of its pad symbol. With no\n",
    "    padding requested the plain ``iter(sequence)`` is returned.\n",
    "    \"\"\"\n",
    "    items = iter(sequence)\n",
    "    if not (pad_left or pad_right):\n",
    "        return items\n",
    "    # The pad width is only evaluated for a side that was actually requested.\n",
    "    leading = (left_pad_symbol,) * (n - 1) if pad_left else ()\n",
    "    trailing = (right_pad_symbol,) * (n - 1) if pad_right else ()\n",
    "    return itertools.chain(leading, items, trailing)\n",
    "\n",
    "def ngrams(\n",
    "    sequence,\n",
    "    n: int,\n",
    "    pad_left = False,\n",
    "    pad_right = False,\n",
    "    left_pad_symbol = None,\n",
    "    right_pad_symbol = None,\n",
    "):\n",
    "    \"\"\"Lazily yield every run of ``n`` consecutive items of ``sequence`` as a tuple.\n",
    "\n",
    "    When ``pad_left`` / ``pad_right`` is truthy, ``n - 1`` copies of the matching\n",
    "    pad symbol are placed before / after the items first. Nothing is yielded when\n",
    "    the (padded) input holds fewer than ``n`` items.\n",
    "    \"\"\"\n",
    "    stream = iter(sequence)\n",
    "    if pad_left:\n",
    "        stream = itertools.chain((left_pad_symbol,) * (n - 1), stream)\n",
    "    if pad_right:\n",
    "        stream = itertools.chain(stream, (right_pad_symbol,) * (n - 1))\n",
    "\n",
    "    # Sliding window: fill it up to n items, emit it, then drop the oldest item.\n",
    "    window = []\n",
    "    for token in stream:\n",
    "        window.append(token)\n",
    "        if len(window) < n:\n",
    "            continue\n",
    "        yield tuple(window)\n",
    "        del window[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['for those interested in',\n",
       " 'getting the latest online shopping deals',\n",
       " 'sharing the latest deals you find',\n",
       " 'sharing promo codes from websites online shopping platforms apps etc',\n",
       " 'spending smart and saving smarter',\n",
       " 'let s post here',\n",
       " 'waiting for promo code',\n",
       " 'mai la',\n",
       " 'ownself post ownself song',\n",
       " 'power lah even shopback come to edmw riao']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first ten lines after cleaning.\n",
    "singlish[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output files for the split performed by the classification loop (one\n",
    "# JSON-encoded string per line). Both handles stay open across cells and are\n",
    "# closed explicitly in the cell after that loop.\n",
    "fopen_manglish = open('prepare-manglish-manglish.jsonl', 'w')\n",
    "fopen_en = open('prepare-manglish-en.jsonl', 'w')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████████████████████| 41670469/41670469 [29:25<00:00, 23603.13it/s]\n"
     ]
    }
   ],
   "source": [
    "# Route every cleaned line to the Manglish file when it contains a vocabulary\n",
    "# word or phrase, otherwise to the English file. Lines labelled 'malay', 'ind',\n",
    "# 'other' or 'rojak' by the language detector are dropped.\n",
    "\n",
    "# Longest vocabulary entry, counted in words. N-grams above this order can never\n",
    "# match, and deriving it from the vocabulary also covers phrases longer than the\n",
    "# previously hard-coded limit of four words.\n",
    "max_phrase_len = max((len(phrase.split()) for phrase in manglish_vocab), default=1)\n",
    "\n",
    "for s in tqdm(singlish):\n",
    "    if not s:\n",
    "        # The cleaner replaces digits/punctuation with spaces and strips, so a\n",
    "        # line can come back empty; such lines carry no text to classify.\n",
    "        continue\n",
    "    label = language_detection.predict([s])[0]\n",
    "    if label in {'malay', 'ind', 'other', 'rojak'}:\n",
    "        continue\n",
    "    splitted = s.split()\n",
    "    # Candidate matches: the single words plus every n-gram up to the longest phrase.\n",
    "    ngs = set(splitted)\n",
    "    for order in range(2, max_phrase_len + 1):\n",
    "        ngs.update(' '.join(gram) for gram in ngrams(splitted, order))\n",
    "    record = f'{json.dumps(s)}\\n'\n",
    "    if ngs & manglish_vocab:\n",
    "        fopen_manglish.write(record)\n",
    "    else:\n",
    "        fopen_en.write(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close both output files before they are inspected from the shell below.\n",
    "fopen_manglish.close()\n",
    "fopen_en.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8546147 prepare-manglish-manglish.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "# Count the records written to the Manglish file.\n",
    "!wc -l prepare-manglish-manglish.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"mai la\"\r\n",
      "\"ownself post ownself song\"\r\n",
      "\"power lah even shopback come to edmw riao\"\r\n"
     ]
    }
   ],
   "source": [
    "# Show the first three records of the Manglish file.\n",
    "!head -n 3 prepare-manglish-manglish.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "30678618 prepare-manglish-en.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "# Count the records written to the English file.\n",
    "!wc -l prepare-manglish-en.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"for those interested in\"\r\n",
      "\"getting the latest online shopping deals\"\r\n",
      "\"sharing the latest deals you find\"\r\n"
     ]
    }
   ],
   "source": [
    "# Show the first three records of the English file.\n",
    "!head -n 3 prepare-manglish-en.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
